Load Libraries

# Data manipulation and visualization 
library(tidyverse)
library(here)
library(janitor)
library(patchwork)
#library(grid)
#library(gridExtra)
#library(ghibli)
#library(RColorBrewer)
#library(gt)
#library(gtExtras)

# Data analysis/statistics 
library(car) #ANOVA
library(ggeffects)
library(effects) #dependency for ggeffects
library(ggResidpanel) #residual panel
library(GGally)
library(arm) #discrete.histogram() function
library(MASS) #required for arm package

Load the functions

dispersion <- function (model){
  sum(residuals(model, type = "pearson")^2)/(length(model$y)-length(model$coefficients))
}

Load data

The variables with are interested in are: - turf algae (column ‘ta’) - hard coral (column ‘hard_coral’) - sand (column ‘sand’) - crustose coralline algae (column ‘cca’)

fish <- read_csv(here("HW5","data","CRCP_Reef_Fish_Surveys_kole.csv"))

Question 1: Predictor Correlations

fish1 <- fish[,c("ta","hard_coral", "sand", "cca")]
ggpairs(fish1)

Question 2: Poisson distribution with single predictor model

For each of the four predictors, make single-predictor models where the response variable is kole counts. Consider which of the predictors could be transformed to reduce skew along the x-axis. For educational purposes, start by using a poisson distribution for the counts.

Turf Algae

#Based on the frequency distribution of the turf algae (ta) data, the predictor does not seem to be suited for the Poisson distribution. 
par(mfrow = c(1,2))
discrete.histogram(fish$count)
discrete.histogram(fish$ta)

ta_lm <- lm(count ~ ta, data = fish)
summary(ta_lm)
## 
## Call:
## lm(formula = count ~ ta, data = fish)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -28.110 -11.896  -6.206   4.173 111.395 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 29.11012    3.22407   9.029  < 2e-16 ***
## ta          -0.26263    0.05234  -5.018 8.85e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 20.31 on 308 degrees of freedom
## Multiple R-squared:  0.07557,    Adjusted R-squared:  0.07257 
## F-statistic: 25.18 on 1 and 308 DF,  p-value: 8.854e-07
resid_panel(ta_lm, plots = c("resid", "qq", "lev", "hist"))

ta_glm <- glm(count ~ ta, family= poisson, data = fish)
summary(ta_glm)
## 
## Call:
## glm(formula = count ~ ta, family = poisson, data = fish)
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  3.5784202  0.0352931   101.4   <2e-16 ***
## ta          -0.0177069  0.0006581   -26.9   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 7573.1  on 309  degrees of freedom
## Residual deviance: 6862.4  on 308  degrees of freedom
## AIC: 7783.4
## 
## Number of Fisher Scoring iterations: 6
resid_panel(ta_glm, plots = c("resid", "qq", "lev", "hist"))

resid_xpanel(ta_glm)

# lecture notes suggest using plot(log.y = TRUE) but I get an error. 
ggeff <- ggeffect(ta_glm) %>% plot()+ scale_y_continuous(trans = "log")
ggeff

plot(allEffects(ta_glm))

Anova(ta_glm)
## Analysis of Deviance Table (Type II tests)
## 
## Response: count
##    LR Chisq Df Pr(>Chisq)    
## ta   710.68  1  < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
discrete.histogram(fish$ta, prob.col = 'orange', xlim = c(0, 100), ylim = c(0,0.18))
lines(0:25, dpois(0:25, lambda = mean(fish$count)), col = 'blue',
lwd = 4, type = 'b')
legend(12, 0.15, lwd = 4, col = 'blue', lty = 1, legend = "Poisson
prediction")

Overdispersion: Poisson and Binomial distributions makes assumptions on the variability of the data which is used for the calculations of likelihood. Larger variability will result in smaller Confidence Intervals (CI) and p-values, which is inaccurate.
With a dispersion of 30.10 (>>>1), the data is over dispersed and it will result in smaller CI and p-values than it should.

dispersion(ta_glm)
## [1] 30.10215

Hard Coral

#Based on the frequency distribution of the hard coral data, the predictor seem to be suited for the Poisson distribution. 
par(mfrow = c(1,2))
discrete.histogram(fish$count)
discrete.histogram(fish$hard_coral)

coral_lm <- lm(count ~ hard_coral, data = fish)
summary(coral_lm)
## 
## Call:
## lm(formula = count ~ hard_coral, data = fish)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -33.440  -9.619  -6.177   3.039 111.774 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  5.31592    1.69524   3.136  0.00188 ** 
## hard_coral   0.43034    0.06312   6.818 4.88e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 19.69 on 308 degrees of freedom
## Multiple R-squared:  0.1311, Adjusted R-squared:  0.1283 
## F-statistic: 46.49 on 1 and 308 DF,  p-value: 4.876e-11
resid_panel(coral_lm, plots = c("resid", "qq", "lev", "hist"))

coral_glm <- glm(count ~ hard_coral, family= poisson, data = fish)
summary(coral_glm)
## 
## Call:
## glm(formula = count ~ hard_coral, family = poisson, data = fish)
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) 2.0379324  0.0257021   79.29   <2e-16 ***
## hard_coral  0.0243714  0.0006953   35.05   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 7573.1  on 309  degrees of freedom
## Residual deviance: 6481.7  on 308  degrees of freedom
## AIC: 7402.7
## 
## Number of Fisher Scoring iterations: 6
resid_panel(coral_glm, plots = c("resid", "qq", "lev", "hist"))

resid_xpanel(coral_glm)

# lecture notes suggest using plot(log.y = TRUE) but I get an error. 
ggeff <- ggeffect(coral_glm) %>% plot()+ scale_y_continuous(trans = "log")
ggeff

plot(allEffects(coral_glm))

Anova(coral_glm)
## Analysis of Deviance Table (Type II tests)
## 
## Response: count
##            LR Chisq Df Pr(>Chisq)    
## hard_coral   1091.4  1  < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
discrete.histogram(fish$hard_coral, prob.col = 'orange', xlim = c(0, 75))
lines(0:25, dpois(0:25, lambda = mean(fish$count)), col = 'blue',
lwd = 4, type = 'b')
legend(12, 0.20, lwd = 4, col = 'blue', lty = 1, legend = "Poisson
prediction")

dispersion(coral_glm)
## [1] 28.13073

With a dispersion of 28.13 (>>>1), the data is over dispersed and it will result in smaller CI and p-values than it should.

Sand

#Based on the frequency distribution of the hard coral data, the predictor seem to be suited for the Poisson distribution. 
par(mfrow = c(1,2))
discrete.histogram(fish$count)
discrete.histogram(fish$sand)

coral_lm <- lm(count ~ sand, data = fish)
summary(coral_lm)

resid_panel(coral_lm, plots = c("resid", "qq", "lev", "hist"))
sand_glm <- glm(count ~ sand, family= poisson, data = fish)
summary(sand_glm)
## 
## Call:
## glm(formula = count ~ sand, family = poisson, data = fish)
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  2.946727   0.018964  155.39   <2e-16 ***
## sand        -0.035770   0.001685  -21.22   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 7573.1  on 309  degrees of freedom
## Residual deviance: 6998.7  on 308  degrees of freedom
## AIC: 7919.6
## 
## Number of Fisher Scoring iterations: 6
resid_panel(sand_glm, plots = c("resid", "qq", "lev", "hist"))

resid_xpanel(sand_glm)

# lecture notes suggest using plot(log.y = TRUE) but I get an error. 
ggeff <- ggeffect(sand_glm) %>% plot()+ scale_y_continuous(trans = "log")
ggeff

plot(allEffects(sand_glm))

Anova(sand_glm)
## Analysis of Deviance Table (Type II tests)
## 
## Response: count
##      LR Chisq Df Pr(>Chisq)    
## sand   574.42  1  < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
discrete.histogram(fish$sand, prob.col = 'orange', xlim = c(0, 75))
lines(0:25, dpois(0:25, lambda = mean(fish$count)), col = 'blue',
lwd = 4, type = 'b')
legend(12, 0.20, lwd = 4, col = 'blue', lty = 1, legend = "Poisson
prediction")

dispersion(sand_glm)
## [1] 30.97697

With a dispersion of 30.98 (>>>1), the data is over dispersed and it will result in smaller CI and p-values than it should.

CCA

#Based on the frequency distribution of the hard coral data, the predictor seem to be suited for the Poisson distribution. 
par(mfrow = c(1,2))
discrete.histogram(fish$count)
discrete.histogram(fish$cca)

cca_lm <- lm(count ~ cca, data = fish)
summary(coral_lm)

resid_panel(coral_lm, plots = c("resid", "qq", "lev", "hist"))
cca_glm <- glm(count ~ cca, family= poisson, data = fish)
summary(cca_glm)
## 
## Call:
## glm(formula = count ~ cca, family = poisson, data = fish)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) 2.270516   0.021449  105.85   <2e-16 ***
## cca         0.034430   0.001153   29.85   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 7573.1  on 309  degrees of freedom
## Residual deviance: 6846.7  on 308  degrees of freedom
## AIC: 7767.6
## 
## Number of Fisher Scoring iterations: 6
resid_panel(cca_glm, plots = c("resid", "qq", "lev", "hist"))

resid_xpanel(cca_glm)

# lecture notes suggest using plot(log.y = TRUE) but I get an error. 
ggeff <- ggeffect(cca_glm) %>% plot()+ scale_y_continuous(trans = "log")
ggeff

plot(allEffects(cca_glm))

Anova(cca_glm)
## Analysis of Deviance Table (Type II tests)
## 
## Response: count
##     LR Chisq Df Pr(>Chisq)    
## cca   726.46  1  < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
discrete.histogram(fish$cca, prob.col = 'orange', xlim = c(0, 75))
lines(0:25, dpois(0:25, lambda = mean(fish$count)), col = 'blue',
lwd = 4, type = 'b')
legend(12, 0.20, lwd = 4, col = 'blue', lty = 1, legend = "Poisson
prediction")

dispersion(cca_glm)
## [1] 30.8293

With a dispersion of 30.83 (>>>1), the data is over dispersed and it will result in smaller CI and p-values than it should.

Answer: Relationship between Kole and the predictors

  • There is a positive relationship between Kole and hard corals, and CCA. An increase in those predictors will result in an significant increase in Kole count.
  • There is a negative relationship between Kole and turf algae, and sand. An increase in those predictors will result in an significant decrease in Kole count

Question 3: dealing with overdispersion in a model

To deal with over-dispersion, we can use a negative binomial distribution or use random effects.

Question: including all substrate types in a single model